# 
# Covid-19 Vaccination and Mass Hysteria
# Dohyo Jeong
# Last updated 10/July/2022
# This code is for text preprocessing, 
# Sentiment analysis, frequency analysis, and TF-IDF analysis.
#
#-----------------------------------------------------------------

# Session housekeeping: wipe the workspace, remember the graphical parameters,
# close the RStudio plot pane device (if any) and clear the console.
rm(list = ls())                        # Clear environment
# no.readonly = TRUE keeps only the settable parameters, so that
# par(oldpar) can later restore them without warnings.
oldpar <- par(no.readonly = TRUE)      # save default graphical parameters
# dev.list() is NULL when no device is open, and indexing it by a missing name
# yields NA (not NULL) when other devices are open. Test the name explicitly,
# otherwise dev.off(NA) fails outside RStudio.
open_devices <- dev.list()
if ("RStudioGD" %in% names(open_devices)) { # Clear plot window
  dev.off(open_devices[["RStudioGD"]])
}
cat("\014")                            # Clear the Console


# Load the raw tweet table. The rest of the script reads its `text` and
# `week` columns.
UStext <- read.csv(file = "eng_total.csv", header = TRUE, sep = ",")


# Import libraries
library(tm)
library(xml2)
library(stringr)
library(dplyr)
library(tidytext)
library(ggplot2)

# Let's start Pre-processing!
# URL removal: delete every run of non-whitespace characters starting at "http".
# x: character vector; returns a character vector of the same length.
removeURL <- function(x) {
  url_pattern <- "http[^[:space:]]*"
  gsub(pattern = url_pattern, replacement = "", x = x)
}
# Mention removal: delete "@" together with the word characters that follow it.
# x: character vector; returns a character vector of the same length.
removeMention <- function(x) {
  mention_pattern <- "@\\w+"
  gsub(pattern = mention_pattern, replacement = "", x = x)
}
# Hashtag removal: delete "#" together with the non-whitespace run after it.
# x: character vector; returns a character vector of the same length.
removeHashtag <- function(x) {
  hashtag_pattern <- "#\\S+"
  gsub(pattern = hashtag_pattern, replacement = "", x = x)
}
# Carriage-return / line-feed removal.
# Each run of "\r" / "\n" characters is replaced by ONE space rather than by
# nothing, so that words on adjacent lines are not glued together
# ("first\nsecond" -> "first second", not "firstsecond").
# x: character vector; returns a character vector of the same length.
removeCarriage <- function(x) {
  gsub("[\r\n]+", " ", x)
}
# Emoticon removal: delete every character outside the range \x01-\x7F,
# i.e. everything that is not plain ASCII (emoji, accented letters, ...).
# x: character vector; returns a character vector of the same length.
removeEmoticon <- function(x) {
  non_ascii_pattern <- "[^\x01-\x7F]"
  gsub(pattern = non_ascii_pattern, replacement = "", x = x)
}
# Retweet removal: delete an "rt" / "via" marker together with the mention(s)
# that follow it (e.g. "rt @user").
# The leading \\b makes sure the marker is a word of its own; without it the
# tail of ordinary words was eaten as well ("support @cdc" -> "suppo").
# x: character vector (expected to be lower-cased already);
# returns a character vector of the same length.
removeRT <- function(x){
  gsub("\\b(rt|via)((?:\\b\\W*@\\w+)+)", "", x)
}
# Invoice removal: delete invoice-style references of the form
# "inv/<digits>/<roman>/<roman>/<digits>", matched case-insensitively.
# x: character vector; returns a character vector of the same length.
removeInvoice <- function(x){
  # TRUE instead of T: T is an ordinary variable that can be reassigned.
  gsub("inv/[0-9]+/+[xvi]+/[xvi]+/[0-9]+", "", x, ignore.case = TRUE)
}
# HTML removal: wrap the text in a dummy <x> element, parse it as HTML with
# xml2 and return the text content of the parsed document.
unescapeHTML <- function(str) {
  wrapped <- paste0("<x>", str, "</x>")
  parsed <- xml2::read_html(wrapped)
  xml2::xml_text(parsed)
}


# ---------- text cleaning ----------
# (This header must start with "#": a bare run of dashes is parsed by R as
# unary minus operators applied to the next statement.)

# Extra stop words removed in addition to tm's English stop-word list.
custom_stopwords <- c("rt", "will", "can", "la", "el", "en", "ud")

# Run the whole cleaning pipeline on a character vector of raw tweets.
# tweets: character vector, one tweet per element.
# Returns a tm VCorpus with one cleaned document per tweet.
clean_tweet_corpus <- function(tweets) {
  corpus <- VCorpus(VectorSource(tweets))
  # Case folding
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Retweet removal
  corpus <- tm_map(corpus, content_transformer(removeRT))
  # Hashtag removal
  corpus <- tm_map(corpus, content_transformer(removeHashtag))
  # URL removal
  corpus <- tm_map(corpus, content_transformer(removeURL))
  # HTML removal
  corpus <- tm_map(corpus, content_transformer(unescapeHTML))
  # Mention removal
  corpus <- tm_map(corpus, content_transformer(removeMention))
  # Carriage removal
  corpus <- tm_map(corpus, content_transformer(removeCarriage))
  # Emoticon removal
  corpus <- tm_map(corpus, content_transformer(removeEmoticon))
  # Invoice removal
  corpus <- tm_map(corpus, content_transformer(removeInvoice))
  # Eliminate extra white spaces
  corpus <- tm_map(corpus, stripWhitespace)
  # Remove English common stop words. This runs before punctuation removal
  # so that entries containing apostrophes can still match.
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # Remove the custom stop words
  corpus <- tm_map(corpus, removeWords, custom_stopwords)
  # Remove punctuation
  corpus <- tm_map(corpus, removePunctuation)
  # Eliminate extra white spaces
  corpus <- tm_map(corpus, stripWhitespace)
  # Text stemming - which reduces words to their root form
  tm_map(corpus, stemDocument)
}

# Turn a cleaned corpus into a data frame with columns `text` and `week`.
# corpus:  VCorpus returned by clean_tweet_corpus().
# week_id: week number stored in the `week` column of every row.
corpus_to_frame <- function(corpus, week_id) {
  # vapply (not sapply) guarantees a character vector, also for an empty corpus.
  data.frame(text = vapply(corpus, as.character, character(1)),
             stringsAsFactors = FALSE) %>%
    mutate(week = week_id)
}

# week 1 -------------
enweek_1 <- subset(UStext, week == 1)
# Read the data
data_enweek_1 <- enweek_1$text
# Work with corpus
tweet_corpus1 <- clean_tweet_corpus(data_enweek_1)
# Check the final result
inspect(tweet_corpus1[[1]])
# Save as data
df_clean1 <- corpus_to_frame(tweet_corpus1, 1)

# week 2 -------------
enweek_2 <- subset(UStext, week == 2)
# Read the data
data_enweek_2 <- enweek_2$text
# Work with corpus
tweet_corpus2 <- clean_tweet_corpus(data_enweek_2)
# Check the final result
inspect(tweet_corpus2[[1]])
# Save as data
df_clean2 <- corpus_to_frame(tweet_corpus2, 2)


#Using this frame, repeat the text cleaning processes from week 1 to week 52.
######################
#---------------------------------------------------------
#---------- try to compare------------

# Convert the per-week data frames to tibbles ------------
# Each frame is (re)tagged with its week number.
df_clean1 <- mutate(as_tibble(df_clean1), week = 1)
df_clean2 <- mutate(as_tibble(df_clean2), week = 2)

### Repeat the same conversion for every week from 1 to 52

df_clean50 <- mutate(as_tibble(df_clean50), week = 50)
df_clean51 <- mutate(as_tibble(df_clean51), week = 51)
df_clean52 <- mutate(as_tibble(df_clean52), week = 52)
##As in the previous process, repeat the 'tibble' transformation process from week 1 to week 52 using this frame.
#################
#--------------------------------------------------
# Stack the 52 weekly tibbles (df_clean1 ... df_clean52) into one table and
# keep only the `text` and `week` columns.
weekly_frames <- mget(paste0("df_clean", 1:52))
bind_keywords <- select(bind_rows(weekly_frames), text, week)
head(bind_keywords)

# Tokenise the cleaned tweets: one row per word.
# The token column has to be named `word`, because every following step
# (length filter, recoding, counting) refers to a column of that name.
bind_token <- bind_keywords %>%
  unnest_tokens(input = text,
                output = word,
                token = "words")

# Stems / abbreviations and the word they are replaced with.
token_recode <- c(
  se = "side effect", nn = "neurinoma", cov = "virus", covid = "virus",
  cov19 = "virus", de = "die", tat = "finish", vaccin = "vaccine",
  immun = "immunity", peopl = "people", avail = "available",
  receiv = "receive", requir = "require", le = "leave", sa = "sad",
  pa = "pennsylvania", les = "less", ben = "benefit", mandat = "mandatory"
)
# Tokens dropped after recoding.
token_drop <- c("235700", "now", "one", "say")

bind_token <- bind_token %>%
  # keep tokens of at least two characters
  filter(str_count(word) > 1) %>%
  # look the token up in token_recode; tokens without an entry stay unchanged
  mutate(word = coalesce(unname(token_recode[word]), word)) %>%
  filter(!word %in% token_drop)

# Show the 20 most frequent tokens.
bind_token %>%
  count(word, sort = TRUE) %>%
  print(n = 20)

#Sentiments dictionary-----------------------------------
library(readr)
library(tidyr)

# AFINN lexicon: one row per word with its numeric sentiment `value`.
senti <- get_sentiments("afinn")
head(senti)

#----------------by id-----------------------------
# sentiment analysis
# NOTE(review): "bind.csv" is not written anywhere in this script; presumably
# it is a saved copy of the combined, cleaned tweets -- confirm.
bind_keywords <- read_csv("bind.csv")

# Give every tweet a running id so that word scores can be summed per tweet.
bind_keywords <- bind_keywords %>%
  mutate(id = row_number())

# One row per word. token = "words" is used because support for
# token = "tweets" was removed in tidytext 0.4.0.
UStt_senti_word <- bind_keywords %>%
  unnest_tokens(word, text, token = "words")

# Stems / abbreviations and the word they are replaced with, so that they can
# match lexicon entries.
senti_recode <- c(
  se = "side effect", nn = "neurinoma", cov = "virus", covid = "virus",
  de = "die", tat = "finish", vaccin = "vaccine", immun = "immunity",
  peopl = "people", avail = "available", receiv = "receive",
  requir = "require", le = "leave", sa = "sad", pa = "pennsylvania",
  les = "less", ben = "benefit", mandat = "mandatory"
)
# Tokens dropped after recoding.
senti_drop <- c("235700", "now", "one", "say")

UStt_senti_word <- UStt_senti_word %>%
  # keep tokens of at least two characters
  filter(str_count(word) > 1) %>%
  # tokens without an entry in senti_recode stay unchanged
  mutate(word = coalesce(unname(senti_recode[word]), word)) %>%
  filter(!word %in% senti_drop)

# Keep only the words that have an AFINN score.
UStt_senti_word <- UStt_senti_word %>%
  inner_join(senti, by = "word")

# Sum the word scores per tweet; tweets without any scored word get 0.
UStt_senti <- bind_keywords %>%
  left_join(UStt_senti_word %>%
              group_by(id) %>%
              summarise(score = sum(value)),
            by = "id") %>%
  replace_na(list(score = 0))


# Save the per-tweet sentiment scores as a comma-separated text file.
# The same path is used for writing and for reading the table back, so the
# two steps cannot drift apart.
# NOTE(review): the script used to read "US_senti_by_id_clean.csv" here, a
# file it never writes -- confirm that no hand-edited .csv was intended.
senti_file <- "US_senti_by_id_clean.txt"
write.table(UStt_senti,
            file = senti_file,
            sep = ",",
            row.names = FALSE,
            quote = FALSE)


#-------------------------------------------------------
## frequency and TF-IDF analysis ##

bind_keywords <- read_csv(senti_file)

# Phase of the year: weeks 1-16 -> phase_1, weeks 17-40 -> phase_2,
# weeks 41 and later -> phase_3. A missing week gives a missing phase.
bind_keywords <- bind_keywords %>%
  mutate(phase = case_when(week < 17  ~ "phase_1",
                           week < 41  ~ "phase_2",
                           week >= 41 ~ "phase_3"))

# Tone of a tweet from the sign of its sentiment score.
bind_keywords <- bind_keywords %>%
  mutate(tone = case_when(score > 0  ~ "positive",
                          score < 0  ~ "negative",
                          score == 0 ~ "neu"))

# One table per phase.
key_phase1 <- subset(bind_keywords, phase == "phase_1")
key_phase2 <- subset(bind_keywords, phase == "phase_2")
key_phase3 <- subset(bind_keywords, phase == "phase_3")
#--------------------------------------------------------
# Keyword frequency and TF-IDF per phase, computed three times:
# for all tweets, for negatively scored tweets and for positively scored ones.
# The shared steps live in the helpers below.
library(tidytext)

# Stems / abbreviations and the word shown instead.
keyword_recode <- c(
  sick = "side effect", nn = "neurinoma", cov = "cov19", de = "die",
  tat = "finish", vaccin = "vaccine", immun = "immunity", peopl = "people",
  avail = "available", receiv = "receive", requir = "require",
  ben = "benefit", sa = "sad", les = "less", se = "side effect",
  mandat = "mandatory"
)
# Words excluded from the rankings (checked after recoding).
keyword_drop <- c("saw", "235700", "cov19", "et", "vaccine", "now", "stadium",
                  "pa", "one", "just", "isnt", "like", "new", "know", "us",
                  "say", "go", "think", "23", "24")

# Count every word per phase, keeping words of at least two characters.
count_keywords <- function(tokens) {
  tokens %>%
    count(phase, word) %>%
    filter(str_count(word) > 1)
}

# Recode and filter a phase/word/n table, then keep the `n_top` most frequent
# words of each phase. The result is grouped by phase.
top_keywords <- function(freq, n_top = 15) {
  freq %>%
    # words without an entry in keyword_recode stay unchanged
    mutate(word = coalesce(unname(keyword_recode[word]), word)) %>%
    filter(!word %in% keyword_drop) %>%
    # Several source words can be recoded to the same word (e.g. "sick" and
    # "se"); add their counts up so each phase/word pair occurs only once.
    group_by(phase, word) %>%
    summarise(n = sum(n), .groups = "drop") %>%
    group_by(phase) %>%
    slice_max(n, n = n_top, with_ties = FALSE)
}

# Add tf, idf and tf_idf with phase as the document; highest tf_idf first.
# NOTE(review): the input is the top-word table, so tf and idf are relative to
# those words only -- confirm this is intended.
add_tf_idf <- function(top_words) {
  top_words %>%
    bind_tf_idf(term = word,
                document = phase,
                n = n) %>%
    arrange(-tf_idf)
}

# Keep the `n_top` words with the highest tf_idf of each phase and fix the
# phase order used for the plot facets.
top_tf_idf <- function(scored, n_top = 10) {
  bars <- scored %>%
    group_by(phase) %>%
    slice_max(tf_idf, n = n_top, with_ties = FALSE)
  bars$phase <- factor(bars$phase,
                       levels = c("phase_1", "phase_2", "phase_3"))
  bars
}

# Horizontal bar chart of `value_col` per word, one facet per phase, with the
# bars ordered within each facet.
# value_col: name of the column to plot, as a string ("n" or "tf_idf").
phase_bar_plot <- function(data, value_col, title,
                           scales = "free_y", ncol = NULL) {
  ggplot(data, aes(x = reorder_within(word, .data[[value_col]], phase),
                   y = .data[[value_col]],
                   fill = phase)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    facet_wrap(~phase, scales = scales, ncol = ncol) +
    scale_x_reordered() +
    labs(title = title,
         x = NULL, y = NULL) +
    theme(text = element_text(size = 10),
          title = element_text(size = 11))
}

# windowsFonts() exists only in R for Windows; on other systems the "Times"
# family is left to the graphics device.
if (.Platform$OS.type == "windows") {
  windowsFonts(Times = windowsFont("Times New Roman"))
}

# One row per word of every tweet (shared by the three analyses below).
bind_token <- bind_keywords %>%
  unnest_tokens(input = text,
                output = word,
                token = "words")

################# Total words ###########################
bind_token

# Frequency of each keyword
frequency <- count_keywords(bind_token)
head(frequency)

keytop20 <- top_keywords(frequency)
keytop20

# Frequency plot
phase_bar_plot(keytop20, "n", "Word appearance Frequency",
               scales = "free_y")

# TF-IDF
set.seed(2022)
keytop20 <- add_tf_idf(keytop20)
keytop20

# tf_idf for "phase_1"
keytop20 %>% filter(phase == "phase_1")
# tf_idf for "phase_2"
keytop20 %>% filter(phase == "phase_2")
# tf_idf for "phase_3"
keytop20 %>% filter(phase == "phase_3")

TFbar <- top_tf_idf(keytop20)

tf_total <- phase_bar_plot(TFbar, "tf_idf", "TF-IDF on Twitter",
                           scales = "free", ncol = 3)
print(tf_total)

tf_total +
  theme(text = element_text(family = "Times"))

################## Negative sentiment #########################
bind_token
bind_token_ng <- bind_token %>%
  filter(score < 0)

# Frequency of each keyword
frequency_ng <- count_keywords(bind_token_ng)
head(frequency_ng)

keytop20_ng <- top_keywords(frequency_ng)
keytop20_ng

# Frequency plot
frq_ng <- phase_bar_plot(keytop20_ng, "n",
                         "Negative Word appearance Frequency",
                         scales = "free_y")
frq_ng +
  theme(text = element_text(family = "Times")) +
  theme(axis.text.y = element_text(size = 12))

# TF-IDF
set.seed(2022)
keytop20_ng <- add_tf_idf(keytop20_ng)
keytop20_ng

# tf_idf for "phase_1"
keytop20_ng %>% filter(phase == "phase_1")
# tf_idf for "phase_2"
keytop20_ng %>% filter(phase == "phase_2")
# tf_idf for "phase_3"
keytop20_ng %>% filter(phase == "phase_3")

TFbar_ng <- top_tf_idf(keytop20_ng)

tf_ng <- phase_bar_plot(TFbar_ng, "tf_idf", "TF-IDF on Negative Sentiment",
                        scales = "free", ncol = 3)
tf_ng +
  theme(text = element_text(family = "Times")) +
  theme(axis.text.y = element_text(size = 12))

##################### Positive Sentiment #####################
bind_token
bind_token_po <- bind_token %>%
  filter(score > 0)

# Frequency of each keyword
frequency_po <- count_keywords(bind_token_po)
head(frequency_po)

keytop20_po <- top_keywords(frequency_po)
keytop20_po

# Frequency plot
frq_po <- phase_bar_plot(keytop20_po, "n",
                         "Positive Word appearance Frequency",
                         scales = "free_y")
frq_po +
  theme(text = element_text(family = "Times")) +
  theme(axis.text.y = element_text(size = 12))

# TF-IDF
set.seed(2022)
keytop20_po <- add_tf_idf(keytop20_po)
keytop20_po

# tf_idf for "phase_1"
keytop20_po %>% filter(phase == "phase_1")
# tf_idf for "phase_2"
keytop20_po %>% filter(phase == "phase_2")
# tf_idf for "phase_3"
keytop20_po %>% filter(phase == "phase_3")

TFbar_po <- top_tf_idf(keytop20_po)

tf_po <- phase_bar_plot(TFbar_po, "tf_idf", "TF-IDF on Positive Sentiment",
                        scales = "free", ncol = 3)
tf_po +
  theme(text = element_text(family = "Times")) +
  theme(axis.text.y = element_text(size = 12))



#-----end----------------
#Based on this, the average sentiment score for each state was reorganized. The file is "US_senti_total_score".
#######
